Pokemon Go analysis

A scratch document for experimenting with some Pokemon Go data and R Markdown (.Rmd) documents.

# Load the data from googlesheets (requires authorisation): --------------------
# Reads the "Data" worksheet of the shared spreadsheet into `df`, trimming
# surrounding whitespace from cell contents.
df <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1EWzGk_qDK8ommXYz2jxYvFSSEzj9Wal976dWRwR4_0w/edit?usp=sharing",
  # 24,  # NOTE(review): leftover commented-out argument; purpose unclear
  sheet = "Data",
  trim_ws = TRUE
)

# Minor data tidy up: ----------------------------------------------------------
# All column conversions are done in a single mutate() so the derived columns
# and the recoded columns are defined in one place.
df <- df |>
  mutate(
    # Categorical variables are stored as factors
    across(c(player, patch, pokemon, type, type_2), factor),
    # Factor copy of the evolution count (could be treated as a number)
    num_evo = factor(number_evolutions),
    # Log difference growth rate in CP
    cp_lambda = log(final_cp) - log(starting_cp),
    # Expand the "y" shorthand to "Yes"; any other non-missing value is "No"
    evolve_stone = factor(ifelse(evolve_stone == "y", "Yes", "No")),
    special = factor(ifelse(special == "y", "Yes", "No"))
  )

Summary figures

# Number of data entries per pokemon
df2 <- count(df, pokemon, name = "pokemon_freq")

# Bar chart of the pokemon with more than 10 entries, ordered by frequency
df2 |>
  filter(pokemon_freq > 10) |>
  ggplot(aes(x = reorder(pokemon, pokemon_freq), y = pokemon_freq)) +
  geom_col() +
  labs(
    x = "Pokemon",
    y = "Number of data entries",
    title = "Most common pokemon in the data"
  ) +
  # Rotate the pokemon names so they stay legible
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Mean starting CP per pokemon (reused by the "lowest" plot that follows)
df2 <- df |>
  group_by(pokemon) |>
  summarise(mean_start = mean(starting_cp))

# Pokemon whose mean starting CP exceeds 1000, ordered by that mean.
# filter() is used instead of logical `[` subsetting: if a mean is NA (a
# missing starting_cp in the group), `[` would insert an all-NA row, whereas
# filter() simply drops rows where the condition is NA.
df2 |>
  filter(mean_start > 1000) |>
  ggplot(aes(x = reorder(pokemon, mean_start), y = mean_start)) +
  geom_point() +
  labs(
    x = "Pokemon",
    y = "Mean starting CP",
    title = "Pokemon with highest average starting CP"
  ) +
  # Rotate the pokemon names so they stay legible
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Pokemon whose mean starting CP is below 300, ordered by that mean.
# Relies on `df2` holding the per-pokemon `mean_start` summary.
# filter() is used instead of logical `[` subsetting: if a mean is NA, `[`
# would insert an all-NA row, whereas filter() drops rows where the condition
# is NA.
df2 |>
  filter(mean_start < 300) |>
  ggplot(aes(x = reorder(pokemon, mean_start), y = mean_start)) +
  geom_point() +
  labs(
    x = "Pokemon",
    y = "Mean starting CP",
    title = "Pokemon with lowest average starting CP"
  ) +
  # Rotate the pokemon names so they stay legible
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Interactive scatter of final CP against starting CP, coloured by the change
# in CP. The `text` aesthetic is passed through so the pokemon name is
# available to the plotly tooltip.
# The title previously read "Pokemon with lowest average starting CP" (copied
# from an earlier figure); it now describes this plot.
ggplotly(
  ggplot(
    df,
    aes(x = starting_cp, y = final_cp, colour = cp_diff, text = pokemon)
  ) +
    geom_point() +
    labs(
      x = "Starting CP",
      y = "Final CP",
      colour = "Change in CP",
      title = "Starting CP vs final CP, coloured by change in CP"
    ) +
    scale_colour_viridis_c(option = "C")
)
# Interactive scatter of final CP against starting CP, coloured by the cost to
# evolve. The `text` aesthetic is passed through so the pokemon name is
# available to the plotly tooltip.
# The title previously read "Pokemon with lowest average starting CP" (copied
# from an earlier figure); it now describes this plot.
ggplotly(
  ggplot(
    df,
    aes(x = starting_cp, y = final_cp, colour = cost_evolve, text = pokemon)
  ) +
    geom_point() +
    labs(
      x = "Starting CP",
      y = "Final CP",
      colour = "Cost to evolve",
      title = "Starting CP vs final CP, coloured by cost to evolve"
    ) +
    scale_colour_viridis_c(option = "C")
)
# Interactive scatter of final CP against starting CP, coloured by player
# level. The `text` aesthetic is passed through so the pokemon name is
# available to the plotly tooltip.
# The title previously read "Pokemon with lowest average starting CP" (copied
# from an earlier figure); it now describes this plot.
ggplotly(
  ggplot(
    df,
    aes(x = starting_cp, y = final_cp, colour = player_level, text = pokemon)
  ) +
    geom_point() +
    labs(
      x = "Starting CP",
      y = "Final CP",
      colour = "Player level",
      title = "Starting CP vs final CP, coloured by player level"
    ) +
    scale_colour_viridis_c(option = "C")
)

Analysis

The figures above suggest a strong linear relationship between starting CP and final CP, but with interacting effects of player level and cost to evolve. The model will include random effects for pokemon (to account for varied effort and to partially pool pokemon together), primary type, and secondary type. It will need to be a log-Normal model so that final CP is forced to be positive.

# Linear mixed model for the change in CP (cp_diff).
# Fixed effects: starting_cp, player_level and cost_evolve with all two-way
# and three-way interactions (the `*` operator expands to these).
# Random effects: a random intercept for each pokemon.
# NOTE(review): the accompanying text mentions random effects for primary and
# secondary type and a log-Normal response; this fit includes neither - confirm
# whether that is intentional.
# NOTE(review): the fit output warns that predictors are on very different
# scales; consider rescaling them before refitting.
m1 <- lmer(cp_diff ~ starting_cp * player_level * cost_evolve + 
              (1 | pokemon),
           data = df)
# Print the fitted model: variance components and fixed-effect estimates
summary(m1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cp_diff ~ starting_cp * player_level * cost_evolve + (1 | pokemon)
##    Data: df
## 
## REML criterion at convergence: 8392
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.2717 -0.2637  0.0112  0.2958  5.4328 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  pokemon  (Intercept) 93972    306.5   
##  Residual             27489    165.8   
## Number of obs: 609, groups:  pokemon, 191
## 
## Fixed effects:
##                                        Estimate Std. Error t value
## (Intercept)                          -2.752e+02  1.961e+02  -1.403
## starting_cp                           1.826e+00  4.374e-01   4.176
## player_level                          6.649e+00  6.101e+00   1.090
## cost_evolve                           8.103e+00  3.251e+00   2.492
## starting_cp:player_level             -2.324e-02  1.456e-02  -1.597
## starting_cp:cost_evolve              -2.386e-02  4.291e-03  -5.561
## player_level:cost_evolve             -1.711e-01  8.345e-02  -2.051
## starting_cp:player_level:cost_evolve  6.632e-04  1.131e-04   5.865
## 
## Correlation of Fixed Effects:
##              (Intr) strtn_ plyr_l cst_vl strtng_cp:p_ strtng_cp:c_ ply_:_
## starting_cp  -0.810                                                      
## player_levl  -0.962  0.843                                               
## cost_evolve  -0.593  0.281  0.446                                        
## strtng_cp:p_  0.775 -0.986 -0.839 -0.196                                 
## strtng_cp:c_  0.591 -0.605 -0.497 -0.700  0.505                          
## plyr_lvl:c_   0.605 -0.295 -0.484 -0.985  0.220        0.669             
## strtng_:_:_  -0.614  0.663  0.545  0.647 -0.583       -0.984       -0.631
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
# Plot the model predictions computed by ggpredict() for m1
m1 |>
  ggpredict() |>
  plot()
## $starting_cp

## 
## $player_level

## 
## $cost_evolve